Out of 30 samples, we selected 18 for this study. These are the normal tissue samples form the control, the UVA and the UVA+SFN treatment groups. normal tissue samples from the UVB_UA groups as well as tumor samples were excluded from this analysis.
First, we removed 7,148 genes with zero counts in > 80% (> 14 out of 18) of samples. 17,273 out of 24,421 genes left.
[1] 7148
[1] 17273
Next, we noramized the counts. To convert number of hits to the relative abundane of genes in each sample, we used transcripts per kilobase million (TPM) normalization, which is as following for the j-th sample:
1. normilize for gene length: a[i, j] = 1,000*count[i, j]/gene[i, j] length(bp)
2. normalize for seq depth (i.e. total count): a(i, j)/sum(a[, j])
3. multiply by one million
A very good comparison of normalization techniques can be found at the following video:
RPKM, FPKM and TPM, clearly explained
After the normalization, each sample’s total is 1M:
02w_CON_0 02w_CON_1 02w_SFN_0 02w_SFN_1 02w_UVB_0 02w_UVB_1 15w_CON_0 15w_CON_1 15w_SFN_0
1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06
15w_SFN_1 15w_UVB_0 15w_UVB_1 25w_CON_0 25w_CON_1 25w_SFN_0 25w_SFN_1 25w_UVB_0 25w_UVB_1
1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06
# Separate top 100 abundant genes
tmp <- melt.data.table(data = tmp,
id.vars = 1:2,
measure.vars = 3:ncol(tmp),
variable.name = "Sample",
value.name = "TPM")
tmp$Week <- substr(x = tmp$Sample,
start = 1,
stop = 3)
tmp$Week <- factor(tmp$Week,
levels = unique(tmp$Week))
tmp$Treatment <- substr(x = tmp$Sample,
start = 5,
stop = 7)
tmp$Treatment <- factor(tmp$Treatment,
levels = c("CON",
"UVB",
"SFN"))
tmp$Replica <- substr(x = tmp$Sample,
start = 9,
stop = 9)
tmp$Replica <- factor(tmp$Replica,
levels = 0:1)
# Plot top 100 abundant genes
p2 <- ggplot(tmp,
aes(x = TPM,
y = Geneid,
fill = Treatment,
shape = Week)) +
# facet_wrap(~ Sex, nrow = 1) +
geom_point(size = 3,
alpha = 0.5) +
geom_vline(xintercept = 1,
linetype = "dashed")
ggplotly(p2)
tmp <- droplevels(tpm[Geneid %in% levels(tpm$Geneid)[1:100]])
tmp <- melt.data.table(data = tmp,
id.vars = 1:2,
measure.vars = 3:ncol(tmp),
variable.name = "Sample",
value.name = "TPM")
tmp$Week <- substr(x = tmp$Sample,
start = 1,
stop = 3)
tmp$Week <- factor(tmp$Week,
levels = unique(tmp$Week))
tmp$Treatment <- substr(x = tmp$Sample,
start = 5,
stop = 7)
tmp$Treatment <- factor(tmp$Treatment,
levels = c("CON",
"UVB",
"SFN"))
tmp$Replica <- substr(x = tmp$Sample,
start = 9,
stop = 9)
tmp$Replica <- factor(tmp$Replica,
levels = 0:1)
# Plot top 100 abundant genes
p3 <- ggplot(tmp,
aes(x = TPM,
y = Geneid,
fill = Treatment,
shape = Week)) +
# facet_wrap(~ Sex, nrow = 1) +
geom_point(size = 3,
alpha = 0.5) +
geom_vline(xintercept = 1,
linetype = "dashed")
ggplotly(p3)
dmeta <- data.table(Sample = colnames(dt1)[-c(1:2)])
dmeta$time <- substr(x = dmeta$Sample,
start = 1,
stop = 3)
dmeta$time <- factor(dmeta$time,
levels = c("02w",
"15w",
"25w"))
dmeta$Week <- factor(dmeta$time,
levels = c("02w",
"15w",
"25w"),
labels = c("Week 2",
"Week 15",
"Week 25"))
dmeta$trt <- substr(x = dmeta$Sample,
start = 5,
stop = 7)
dmeta$trt <- factor(dmeta$trt,
levels = c("CON",
"UVB",
"SFN"))
dmeta$Treatment <- factor(dmeta$trt,
levels = c("CON",
"UVB",
"SFN"),
labels = c("Negative Control",
"Positive Control (UVB)",
"Sulforaphane (SFN)"))
dmeta$Replica <- substr(x = dmeta$Sample,
start = 9,
stop = 9)
dmeta$Replica <- factor(dmeta$Replica,
levels = 0:1)
datatable(dmeta,
options = list(pageLength = nrow(dmeta)))
NOTE: the distributions are skewed. To make them symmetric, log transformation is often applied. However, there is an issue of zeros. In this instance, we added a small values lambda[i] equal to 1/10 of the smallest non-zero value of i-th gene.
dm.tpm <- as.matrix(tpm[, -c(1:2), with = FALSE])
rownames(dm.tpm) <- tpm$Geneid
# # Remove 02w_CON_1 sample and redo PCA
# dm.tpm <- dm.tpm[, colnames(dm.tpm) != "02w_CON_1"]
# dmeta <- dmeta[dmeta$Sample != "02w_CON_1", ]
# Add lambdas to all values, then take a log
dm.ltpm <- t(apply(X = dm.tpm,
MARGIN = 1,
FUN = function(a) {
lambda <- min(a[a > 0])/10
log(a + lambda)
}))
# PCA----
m1 <- prcomp(t(dm.ltpm),
center = TRUE,
scale. = TRUE)
s1 <- summary(m1)
s1
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 70.7928 56.9107 50.8898 28.84564 26.51968 24.81005 23.85276 22.63644
Proportion of Variance 0.2901 0.1875 0.1499 0.04817 0.04072 0.03564 0.03294 0.02967
Cumulative Proportion 0.2901 0.4777 0.6276 0.67575 0.71647 0.75211 0.78504 0.81471
PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16
Standard deviation 20.97344 20.20442 19.24099 19.01279 18.73783 18.53642 17.87923 17.65132
Proportion of Variance 0.02547 0.02363 0.02143 0.02093 0.02033 0.01989 0.01851 0.01804
Cumulative Proportion 0.84018 0.86381 0.88524 0.90617 0.92650 0.94639 0.96490 0.98293
PC17 PC18
Standard deviation 17.16891 2.134e-13
Proportion of Variance 0.01707 0.000e+00
Cumulative Proportion 1.00000 1.000e+00
imp <- data.table(PC = colnames(s1$importance),
Variance = 100*s1$importance[2, ],
Cumulative = 100*s1$importance[3, ])
imp$PC <- factor(imp$PC,
levels = imp$PC)
p1 <- ggplot(imp,
aes(x = PC,
y = Variance)) +
geom_bar(stat = "identity",
fill = "grey",
color = "black") +
geom_line(aes(y = rescale(Cumulative,
to = c(min(Cumulative)*30/100,
30)),
group = rep(1, nrow(imp)))) +
geom_point(aes(y = rescale(Cumulative,
to = c(min(Cumulative)*30/100,
30)))) +
scale_y_continuous("% Variance Explained",
breaks = seq(0, 30, by = 5),
labels = paste(seq(0, 30, by = 5),
"%",
sep = ""),
sec.axis = sec_axis(trans = ~.,
name = "% Cumulative Variance",
breaks = seq(0, 30, length.out = 5),
labels = paste(seq(0, 100, length.out = 5),
"%",
sep = ""))) +
scale_x_discrete("") +
theme(axis.text.x = element_text(angle = 90,
hjust = 1))
p1
# Biplot while keep only the most important variables (Javier)----
# Select PC-s to pliot (PC1 & PC2)
choices <- c(1:3)
# Scores, i.e. points (df.u)
dt.scr <- data.table(m1$x[, choices])
# Add grouping variables
dt.scr$trt <- dmeta$trt
dt.scr$time <- dmeta$time
dt.scr$sample <- dmeta$Sample
# Loadings, i.e. arrows (df.v)
dt.rot <- as.data.frame(m1$rotation[, choices])
dt.rot$feat <- rownames(dt.rot)
dt.rot <- data.table(dt.rot)
# Axis labels
u.axis.labs <- paste(colnames(dt.rot)[choices],
sprintf('(%0.1f%% explained var.)',
100*m1$sdev[choices]^2/sum(m1$sdev^2)))
p1 <- ggplot(data = dt.scr,
aes(x = PC1,
y = PC2,
color = trt,
shape = time)) +
geom_point(size = 4,
alpha = 0.5) +
scale_x_continuous(u.axis.labs[1]) +
scale_y_continuous(u.axis.labs[2])
ggplotly(p1)
p1 <- ggplot(data = dt.scr,
aes(x = PC1,
y = PC3,
color = trt,
shape = time)) +
geom_point(size = 4,
alpha = 0.5) +
scale_x_continuous(u.axis.labs[1]) +
scale_y_continuous(u.axis.labs[3])
ggplotly(p1)
p1 <- ggplot(data = dt.scr,
aes(x = PC2,
y = PC3,
color = trt,
shape = time)) +
geom_point(size = 4,
alpha = 0.5) +
scale_x_continuous(u.axis.labs[2]) +
scale_y_continuous(u.axis.labs[3])
ggplotly(p1)
scatterplot3js(x = dt.scr$PC1,
y = dt.scr$PC2,
z = dt.scr$PC3,
color = as.numeric(dt.scr$trt),
renderer = "auto",
pch = dt.scr$sample,
size = 0.1)
tmp <- tpm[Geneid %in% levels(tpm$Geneid)[(nrow(tpm) - 8):nrow(tpm)]]
tmp <- melt.data.table(data = tmp,
id.vars = 1,
measure.vars = 3:ncol(tmp),
variable.name = "Sample",
value.name = "TPM")
tmp <- merge(dmeta,
tmp,
by = "Sample")
p1 <- ggplot(tmp,
aes(x = Week,
y = TPM,
colour = Treatment,
shape = Replica,
group = Treatment)) +
facet_wrap(~ Geneid,
scales = "free_y") +
geom_point(size = 5,
position = position_dodge(0.5)) +
ggtitle(gene)
plot(p1)
Most abundant molecules in sample 02w_CON_1 are in larger proportions relative the total of the sample (as measured by TPM) compared to all other samples. This could be because less genes were identified in that sample compared to the rest:
colSums(tpm[, -c(1, 2)] > 0)
02w_CON_0 02w_CON_1 02w_SFN_0 02w_SFN_1 02w_UVB_0 02w_UVB_1 15w_CON_0 15w_CON_1 15w_SFN_0
15634 15015 15431 15133 15233 15299 15495 15901 15761
15w_SFN_1 15w_UVB_0 15w_UVB_1 25w_CON_0 25w_CON_1 25w_SFN_0 25w_SFN_1 25w_UVB_0 25w_UVB_1
15793 15958 15828 15543 15344 16098 15732 16264 15979
Exclude sample 02w_CON_1 form the analysis.
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] scales_1.0.0 threejs_0.3.1 igraph_1.2.4.1
[4] plotly_4.9.0 ggplot2_3.1.1 readxl_1.3.1
[7] DESeq2_1.24.0 SummarizedExperiment_1.14.0 DelayedArray_0.10.0
[10] BiocParallel_1.17.18 matrixStats_0.54.0 Biobase_2.44.0
[13] GenomicRanges_1.36.0 GenomeInfoDb_1.20.0 IRanges_2.18.1
[16] S4Vectors_0.22.0 BiocGenerics_0.30.0 DT_0.6
[19] data.table_1.12.2 knitr_1.22
loaded via a namespace (and not attached):
[1] bitops_1.0-6 bit64_0.9-7 RColorBrewer_1.1-2 httr_1.4.0
[5] tools_3.6.0 backports_1.1.4 R6_2.4.0 rpart_4.1-15
[9] Hmisc_4.2-0 DBI_1.0.0 lazyeval_0.2.2 colorspace_1.4-1
[13] nnet_7.3-12 withr_2.1.2 tidyselect_0.2.5 gridExtra_2.3
[17] bit_1.1-14 compiler_3.6.0 htmlTable_1.13.1 labeling_0.3
[21] checkmate_1.9.3 genefilter_1.66.0 stringr_1.4.0 digest_0.6.19
[25] foreign_0.8-71 rmarkdown_1.13 XVector_0.24.0 base64enc_0.1-3
[29] pkgconfig_2.0.2 htmltools_0.3.6 htmlwidgets_1.3 rlang_0.3.4
[33] rstudioapi_0.10 RSQLite_2.1.1 shiny_1.3.2 jsonlite_1.6
[37] crosstalk_1.0.0 acepack_1.4.1 dplyr_0.8.1 RCurl_1.95-4.12
[41] magrittr_1.5 GenomeInfoDbData_1.2.1 Formula_1.2-3 Matrix_1.2-17
[45] Rcpp_1.0.1 munsell_0.5.0 yaml_2.2.0 stringi_1.4.3
[49] zlibbioc_1.30.0 plyr_1.8.4 grid_3.6.0 blob_1.1.1
[53] promises_1.0.1 crayon_1.3.4 lattice_0.20-38 splines_3.6.0
[57] annotate_1.62.0 locfit_1.5-9.1 pillar_1.4.1 geneplotter_1.62.0
[61] XML_3.98-1.19 glue_1.3.1 evaluate_0.14 latticeExtra_0.6-28
[65] httpuv_1.5.1 cellranger_1.1.0 gtable_0.3.0 purrr_0.3.2
[69] tidyr_0.8.3 assertthat_0.2.1 xfun_0.7 mime_0.6
[73] xtable_1.8-4 later_0.8.0 survival_2.44-1.1 viridisLite_0.3.0
[77] tibble_2.1.2 AnnotationDbi_1.46.0 memoise_1.1.0 cluster_2.0.9